# Load the raw Kaggle survey export and inspect the type of every column.
kaggle <- read.csv(file = "kaggle.csv")
str(object = kaggle)
## 'data.frame': 23997 obs. of 57 variables:
## $ Compensation : chr "" "" "" ">$1,000,000" ...
## $ Age : chr "30-34" "30-34" "18-21" "30-34" ...
## $ Gender : chr "Man" "Man" "Man" "Man" ...
## $ Country : chr "India" "Algeria" "Egypt" "Egypt" ...
## $ Student : chr "No" "No" "Yes" "No" ...
## $ Years.Programming : chr "" "1-3 years" "1-3 years" "10-20 years" ...
## $ Similar.Title : chr "" "" "" "Machine Learning/ MLops Engineer" ...
## $ Industry.of.Work : chr "" "" "" "Other" ...
## $ Company.Size : chr "" "" "" "0-49 employees" ...
## $ How.many.individuals.are.responsible: chr "" "" "" "0" ...
## $ Incorporate.Machine.Learning : chr "" "" "" "I do not know" ...
## $ Years.Used.Machine.Learning : chr "" "Under 1 year" "1-2 years" "5-10 years" ...
## $ Helpful.University : chr "" "University courses" "" "" ...
## $ Helpful.Online.Courses : chr "" "" "Online courses (Coursera, EdX, etc)" "Online courses (Coursera, EdX, etc)" ...
## $ Helpful.Social.Media : chr "" "" "" "" ...
## $ Helpful.Video.Platform : chr "" "" "Video platforms (YouTube, Twitch, etc)" "Video platforms (YouTube, Twitch, etc)" ...
## $ Helpful.Kaggle : chr "" "Kaggle (notebooks, competitions, etc)" "Kaggle (notebooks, competitions, etc)" "Kaggle (notebooks, competitions, etc)" ...
## $ Helpful.None : chr "" "" "" "" ...
## $ Media.on.Social.Twitter : chr "" "" "Twitter (data science influencers)" "" ...
## $ Media.on.Social.Email.Newsletters : chr "" "" "Email newsletters (Data Elixir, O'Reilly Data & AI, etc)" "" ...
## $ Media.on.Reddit : chr "" "" "" "" ...
## $ Media.on.Kaggle : chr "" "" "Kaggle (notebooks, forums, etc)" "Kaggle (notebooks, forums, etc)" ...
## $ Media.on.Course.Forums : chr "" "" "" "" ...
## $ Media.on.Youtube : chr "" "" "YouTube (Kaggle YouTube, Cloud AI Adventures, etc)" "YouTube (Kaggle YouTube, Cloud AI Adventures, etc)" ...
## $ Media.on.Podcasts : chr "" "" "Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)" "" ...
## $ Media.on.Blogs : chr "" "" "" "Blogs (Towards Data Science, Analytics Vidhya, etc)" ...
## $ Media.on.Journal.Publications : chr "" "" "" "Journal Publications (peer-reviewed journals, conference proceedings, etc)" ...
## $ Media.on.Slack.Communities : chr "" "" "" "Slack Communities (ods.ai, kagglenoobs, etc)" ...
## $ No.Media.Sources : chr "" "" "" "" ...
## $ Data.Science.on.Coursera : chr "" "" "Coursera" "Coursera" ...
## $ Data.Science.on.edX : chr "" "" "edX" "" ...
## $ Data.Science.on.Kaggle.Learn.Courses: chr "" "" "" "" ...
## $ Data.Science.on.DataCamp : chr "" "" "DataCamp" "DataCamp" ...
## $ Data.Science.on.Fast.ai : chr "" "" "" "" ...
## $ Data.Science.on.Udacity : chr "" "" "Udacity" "" ...
## $ Data.Science.on.Udemy : chr "" "" "Udemy" "" ...
## $ Data.Science.on.LinkedIn.Learning : chr "" "" "LinkedIn Learning" "" ...
## $ Cloud.certification.programs : chr "" "" "" "" ...
## $ Data.Science.University.Courses : chr "" "University Courses (resulting in a university degree)" "University Courses (resulting in a university degree)" "" ...
## $ No.Data.Science.Courses : chr "" "" "" "" ...
## $ Highest.Level.of.Formal.Education : chr "" "Master’s degree" "Bachelor’s degree" "No formal education past high school" ...
## $ Published.Academic.Research.Papers : chr "" "Yes" "" "" ...
## $ Python : chr "" "" "Python" "Python" ...
## $ R : chr "" "" "" "" ...
## $ SQL : chr "" "" "SQL" "" ...
## $ C : chr "" "" "C" "C" ...
## $ C. : chr "" "" "" "" ...
## $ C.. : chr "" "" "" "C++" ...
## $ Java : chr "" "Java" "" "Java" ...
## $ Javascript : chr "" "" "" "Javascript" ...
## $ Bash : chr "" "" "" "Bash" ...
## $ PHP : chr "" "" "" "PHP" ...
## $ MATLAB : chr "" "" "MATLAB" "MATLAB" ...
## $ Julia : chr "" "" "" "" ...
## $ Go : chr "" "" "" "" ...
## $ No.Programming.Languages : chr "" "" "" "" ...
## $ ML.Hubs...Repositories.Used : chr "" "" "" " Huggingface Models " ...
# Move the two categorical columns that sit among the indicator columns so they
# end up, in this order, directly in front of Helpful.University.
kaggle <- relocate(
  kaggle,
  all_of(c("ML.Hubs...Repositories.Used", "Highest.Level.of.Formal.Education")),
  .before = all_of("Helpful.University")
)
# Recode the survey columns into model-ready types. Column positions refer to
# the layout produced by the relocate() step just before this one.

# Multi-select answers (columns 15-57): a non-empty cell means the option was
# selected, so encode them as 1 (selected) / 0 (blank).
kaggle <- kaggle %>%
  mutate(across(15:57, ~ as.numeric(.x != "")))

# Student (column 5): "Yes" becomes 1, every other answer becomes 0.
kaggle <- kaggle %>%
  mutate(Student = as.numeric(Student == "Yes"))

# The remaining categorical answers are still character; store them as factors.
kaggle <- kaggle %>%
  mutate(across(c(2:4, 6:14), as.factor))

# Compensation is the response variable (currently a binned label). Blank
# answers are turned into NA *before* building the factor so that the factor
# does not carry an unused "" level; the NAs are imputed later.
kaggle <- kaggle %>%
  mutate(Compensation = factor(na_if(Compensation, "")))

# Print out our changes.
str(kaggle)
## 'data.frame': 23997 obs. of 57 variables:
## $ Compensation : Factor w/ 27 levels "",">$1,000,000",..: NA NA NA 2 NA NA NA 2 2 NA ...
## $ Age : Factor w/ 11 levels "18-21","22-24",..: 4 4 1 4 7 1 1 3 7 2 ...
## $ Gender : Factor w/ 5 levels "Man","Nonbinary",..: 1 1 1 1 1 5 1 1 1 1 ...
## $ Country : Factor w/ 58 levels "Algeria","Argentina",..: 21 1 14 14 21 21 21 48 56 35 ...
## $ Student : num 0 0 1 0 1 1 1 0 0 1 ...
## $ Years.Programming : Factor w/ 8 levels "","< 1 years",..: 1 3 3 4 7 3 3 2 7 6 ...
## $ Similar.Title : Factor w/ 16 levels "","Currently not employed",..: 1 1 1 10 1 1 1 7 11 1 ...
## $ Industry.of.Work : Factor w/ 16 levels "","Academics/Education",..: 1 1 1 14 1 1 1 5 5 1 ...
## $ Company.Size : Factor w/ 6 levels "","0-49 employees",..: 1 1 1 2 1 1 1 2 2 1 ...
## $ How.many.individuals.are.responsible: Factor w/ 8 levels "","0","14-Oct",..: 1 1 1 2 1 1 1 2 5 1 ...
## $ Incorporate.Machine.Learning : Factor w/ 7 levels "","I do not know",..: 1 1 1 2 1 1 1 4 3 1 ...
## $ Years.Used.Machine.Learning : Factor w/ 10 levels "","1-2 years",..: 1 10 2 8 9 2 10 10 9 6 ...
## $ ML.Hubs...Repositories.Used : Factor w/ 10 levels ""," TensorFlow Hub ",..: 1 1 1 3 1 5 1 5 1 2 ...
## $ Highest.Level.of.Formal.Education : Factor w/ 8 levels "","Bachelor’s degree",..: 1 5 2 6 2 5 8 4 5 2 ...
## $ Helpful.University : num 0 1 0 0 1 0 0 0 0 1 ...
## $ Helpful.Online.Courses : num 0 0 1 1 0 1 0 1 0 1 ...
## $ Helpful.Social.Media : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Helpful.Video.Platform : num 0 0 1 1 0 0 1 1 0 0 ...
## $ Helpful.Kaggle : num 0 1 1 1 0 0 1 1 1 1 ...
## $ Helpful.None : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Media.on.Social.Twitter : num 0 0 1 0 0 0 0 1 0 0 ...
## $ Media.on.Social.Email.Newsletters : num 0 0 1 0 0 0 0 1 0 0 ...
## $ Media.on.Reddit : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Media.on.Kaggle : num 0 0 1 1 0 1 0 1 0 1 ...
## $ Media.on.Course.Forums : num 0 0 0 0 0 1 0 1 0 0 ...
## $ Media.on.Youtube : num 0 0 1 1 0 1 1 1 0 0 ...
## $ Media.on.Podcasts : num 0 0 1 0 0 0 0 1 0 0 ...
## $ Media.on.Blogs : num 0 0 0 1 1 1 1 1 0 0 ...
## $ Media.on.Journal.Publications : num 0 0 0 1 0 0 0 1 0 0 ...
## $ Media.on.Slack.Communities : num 0 0 0 1 0 0 0 1 0 0 ...
## $ No.Media.Sources : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Data.Science.on.Coursera : num 0 0 1 1 0 1 0 1 0 1 ...
## $ Data.Science.on.edX : num 0 0 1 0 0 1 0 0 0 0 ...
## $ Data.Science.on.Kaggle.Learn.Courses: num 0 0 0 0 0 0 1 1 1 1 ...
## $ Data.Science.on.DataCamp : num 0 0 1 1 0 1 0 0 0 1 ...
## $ Data.Science.on.Fast.ai : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Data.Science.on.Udacity : num 0 0 1 0 0 1 0 0 0 0 ...
## $ Data.Science.on.Udemy : num 0 0 1 0 0 1 0 0 0 1 ...
## $ Data.Science.on.LinkedIn.Learning : num 0 0 1 0 0 0 0 0 0 1 ...
## $ Cloud.certification.programs : num 0 0 0 0 0 0 0 0 0 1 ...
## $ Data.Science.University.Courses : num 0 1 1 0 0 0 0 0 0 0 ...
## $ No.Data.Science.Courses : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Published.Academic.Research.Papers : num 0 1 0 0 0 1 0 0 1 0 ...
## $ Python : num 0 0 1 1 1 1 1 1 1 1 ...
## $ R : num 0 0 0 0 0 0 0 1 0 1 ...
## $ SQL : num 0 0 1 0 0 1 1 1 0 1 ...
## $ C : num 0 0 1 1 0 0 0 1 1 1 ...
## $ C. : num 0 0 0 0 0 0 0 1 0 0 ...
## $ C.. : num 0 0 0 1 1 0 1 1 0 1 ...
## $ Java : num 0 1 0 1 1 0 0 1 0 0 ...
## $ Javascript : num 0 0 0 1 0 0 1 1 0 0 ...
## $ Bash : num 0 0 0 1 0 0 0 0 0 1 ...
## $ PHP : num 0 0 0 1 0 0 0 1 0 0 ...
## $ MATLAB : num 0 0 1 1 0 0 0 0 0 0 ...
## $ Julia : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Go : num 0 0 0 0 0 0 0 0 0 0 ...
## $ No.Programming.Languages : num 0 0 0 0 0 0 0 0 0 0 ...
# Plot our variables (this helper was adapted from ChatGPT).
#
# create_distribution_chart(): bar chart of how often each value of one column
# occurs.
#   variable: name of the column to plot, given as a single string.
#   data:     data frame that holds the column; defaults to the script-level
#             `kaggle` object, so existing one-argument calls keep working.
# Returns a ggplot object.
create_distribution_chart <- function(variable, data = kaggle) {
  # geom_bar() counts rows per distinct value. The earlier
  # geom_histogram(stat = "count", binwidth = 1) drew the same bars, but the
  # binwidth argument has no effect once the stat is "count".
  ggplot(data, aes(x = .data[[variable]])) +
    geom_bar(fill = "lightseagreen", color = "turquoise4", alpha = 0.7) +
    labs(
      title = paste("Distribution Chart -", variable),
      x = variable,
      y = "Frequency"
    )
}
# Column names of the data set, one chart per column.
variable_names <- colnames(kaggle)
# Build the chart list by applying the helper to every column name.
charts <- purrr::map(.x = variable_names, .f = create_distribution_chart)
# Printing the list renders each chart in turn (ggsave could write them to
# files instead).
print(charts)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
We see that roughly 66% of our observations have no value for our response variable (Compensation). Personally, I do not want to lose roughly 66% of the data, so I will impute the missing values.
# Turn the binned Compensation labels into continuous numbers: every bin has a
# lower/upper bound, and each respondent gets one uniform random draw inside
# the bounds of their own bin.
#
# The earlier nested ifelse() handed fixed-length runif() vectors to ifelse(),
# which recycles them across all rows, so two rows of the same bin could
# receive the very same draw. Here each matched row gets an independent draw.
#
# Seeded here so the draws are reproducible (previously the seed was only set
# after this step).
set.seed(458)

kaggle$Compensation <- local({
  # Bin label -> sampling bounds (same bounds as before).
  # NOTE(review): the first five bins start 250 above their nominal lower edge
  # (e.g. "$0-999" samples from 250-999) while the others start at the edge -
  # confirm this is intentional.
  bins <- data.frame(
    label = c(
      "$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999",
      "5,000-7,499", "7,500-9,999", "10,000-14,999", "15,000-19,999",
      "20,000-24,999", "25,000-29,999", "30,000-39,999", "40,000-49,999",
      "50,000-59,999", "60,000-69,999", "70,000-79,999", "80,000-89,999",
      "90,000-99,999", "100,000-124,999", "125,000-149,999",
      "150,000-199,999", "200,000-249,999", "250,000-299,999",
      "300,000-499,999", "$500,000-999,999", ">$1,000,000"
    ),
    lower = c(
      250, 1250, 2250, 3250, 4250,
      5000, 7500, 10000, 15000,
      20000, 25000, 30000, 40000,
      50000, 60000, 70000, 80000,
      90000, 100000, 125000,
      150000, 200000, 250000,
      300000, 500000, 1000000
    ),
    upper = c(
      999, 1999, 2999, 3999, 4999,
      7499, 9999, 14999, 19999,
      24999, 29999, 39999, 49999,
      59999, 69999, 79999, 89999,
      99999, 124999, 149999,
      199999, 249999, 299999,
      499999, 999999, 3000000
    ),
    stringsAsFactors = FALSE
  )

  labels <- as.character(kaggle$Compensation)
  bin_row <- match(labels, bins$label)
  matched <- !is.na(bin_row)

  # Missing answers stay NA (they are imputed later); a non-missing label that
  # matches no bin falls back to 0, as in the original ifelse() chain.
  values <- ifelse(is.na(labels), NA_real_, 0)
  values[matched] <- runif(
    sum(matched),
    min = bins$lower[bin_row[matched]],
    max = bins$upper[bin_row[matched]]
  )
  values
})
I used this method to make sure variability is not reduced. If we had replaced each bin with its median or mean, we would have many observations sharing exactly the same value, which reduces variability a lot. Variability in the response is needed for the OLS assumptions.
# Fix the RNG state so the imputation is reproducible.
set.seed(458)
# An imputation model: mice with the CART method, producing m = 3 imputed data
# sets over maxit = 5 iterations. `method` is spelled out in full rather than
# relying on partial argument matching (`meth`).
# NOTE(review): `target` is not a documented mice() argument; it is only
# forwarded through `...` - confirm it has the intended effect.
impute_model <- mice(
  kaggle,
  m = 3,
  maxit = 5,
  method = "cart",
  target = "Compensation"
)
##
## iter imp variable
## 1 1 Compensation
## 1 2 Compensation
## 1 3 Compensation
## 2 1 Compensation
## 2 2 Compensation
## 2 3 Compensation
## 3 1 Compensation
## 3 2 Compensation
## 3 3 Compensation
## 4 1 Compensation
## 4 2 Compensation
## 4 3 Compensation
## 5 1 Compensation
## 5 2 Compensation
## 5 3 Compensation
## Warning: Number of logged events: 15
# Extract a completed data set from the imputation model. `action = 1L` is the
# default of complete(): only the first of the imputed data sets is kept.
kaggle <- mice::complete(impute_model, action = 1L)
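This mice method uses CART (classification and regression trees), which fits a tree model for each variable that has missing values and imputes from it, so the imputed values follow the relationships in the observed data. I did not want to use k-means, as it would give less variability.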
# Persist the imputed data set next to the earlier export.
write.csv(x = kaggle, file = "kaggleContinuous(1).csv")
I already have a kaggleContinuous.csv. Instead of overwriting that file, I added "(1)" to the name so the new data is written to a separate file.
# Percentage bar chart for one column of the script-level `kaggle` data frame.
#   variable: name of the column to plot, given as a single string.
# Returns a ggplot object with one labelled bar per distinct value.
create_distribution_chart <- function(variable) {
  column <- sym(variable)

  # Number of rows per distinct value, then each value's share of all rows.
  value_counts <- summarise(group_by(kaggle, !!column), count = n())
  data_summary <- mutate(value_counts, percentage = count / sum(count) * 100)

  # Percentage label placed in the vertical middle of each bar.
  bar_labels <- geom_text(
    aes(label = sprintf("%.1f%%", percentage)),
    position = position_stack(vjust = 0.5),
    size = 3
  )

  chart <- ggplot(data_summary, aes(x = !!column, y = percentage))
  chart <- chart +
    geom_bar(stat = "identity", fill = "#1d9da5", color = "#449999", alpha = 0.7)
  chart <- chart + bar_labels
  chart <- chart +
    labs(
      title = paste("Distribution Chart -", variable),
      x = variable,
      y = "Percentage"
    )
  # Values are already on a 0-100 scale, hence scale = 1.
  chart + scale_y_continuous(labels = scales::percent_format(scale = 1))
}
# Column names of the data set, one chart per column.
variable_names <- colnames(kaggle)
# Build the chart list by applying the helper to every column name.
charts <- purrr::map(.x = variable_names, .f = create_distribution_chart)
# Printing the list renders each chart in turn (ggsave could write them to
# files instead).
print(charts)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
Notice our new distributions. So much prettier.
# Select the numeric columns - only those work for correlation plots.
# select(where(...)) replaces the superseded select_if().
kaggle1 <- kaggle %>%
  dplyr::select(where(is.numeric))
# Pairwise correlation matrix. The result is stored under the name `cor`,
# which shadows the function name for variable lookups only.
cor <- cor(kaggle1)
# NOTE(review): newer corrplot releases renamed `cl.lim` to `col.lim` - confirm
# against the installed version.
corrplot(
  cor,
  method = "color",
  col = colorRampPalette(c("gray27", "white", "#449999"))(100),
  cl.lim = c(0, 1),
  tl.col = "#1d9da5"
)
# Baseline OLS fit of Compensation on every other column, followed by the
# model summary and the variance inflation factors of its terms.
lm <- lm(formula = Compensation ~ ., data = kaggle)
summary(object = lm)
vif(lm)
An R² of 0.23 is not too bad, but a lot of the variance is still unexplained. Based on
the VIF output, several of our variables have strongly inflated
variances. I will be getting rid of
Published.Academic.Research.Papers, How.many.individuals.are.responsible, Company.Size, Years.Used.Machine.Learning, Similar.Title, and Industry.of.Work.
This gets rid of 6 variables.
# Drop the six columns with inflated variances, then refit the same model and
# look at the summary and VIFs again.
kaggle <- dplyr::select(
  kaggle,
  -all_of(c(
    "Published.Academic.Research.Papers",
    "How.many.individuals.are.responsible",
    "Company.Size",
    "Years.Used.Machine.Learning",
    "Similar.Title",
    "Industry.of.Work"
  ))
)
lm <- lm(formula = Compensation ~ ., data = kaggle)
summary(object = lm)
vif(lm)
R² dropped by 1 point, which is not a meaningful loss. The VIFs look way better, but I'm keeping Years.Programming in (VIF of 24, our highest currently). I also completely forgot to check for outliers. Compensation has a few sporadic values from 500,000 up to the maximum. Let's get rid of those.
# Filter out observations with compensation of 500,000 and above. A strict `<`
# is used so that a value of exactly 500,000 is removed as well, matching the
# stated cut-off (the previous `<=` kept it).
kaggle <- kaggle %>%
  dplyr::filter(Compensation < 500000)
# Refit on the trimmed data to see the new model.
# NOTE(review): the rows are trimmed on the response itself, so compare this
# R-squared with the earlier one with care.
summary(lm(Compensation ~ ., data = kaggle))
R² increased to 0.46. A huge upgrade. Nice.
# Split the data 60 / 20 / 20 into training, validation and test sets.
set.seed(458)

# 60% of the rows go to training.
train_indices <- createDataPartition(kaggle$Compensation, p = 0.6, list = FALSE)
trainData <- kaggle[train_indices, ]
tempData <- kaggle[-train_indices, ]

# Half of the remaining rows form the validation set ...
validation_indices <- createDataPartition(
  tempData$Compensation,
  p = 0.5,
  list = FALSE,
  times = 1
)
validationData <- tempData[validation_indices, ]

# ... and the other half forms the test set. Previously testData was indexed
# with validation_indices, which made it an exact copy of validationData, and
# test_indices was computed from validationData but never used.
test_indices <- setdiff(seq_len(nrow(tempData)), validation_indices)
testData <- tempData[test_indices, ]
# Fit the model on the training set and show its summary.
lmTrain <- lm(Compensation ~ ., data = trainData)
summary(lmTrain)
# Coefficient table. The fitted object is reused instead of fitting the same
# model a second time.
lmTrain %>%
  tidy() %>%
  kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 11609.815645 | 7836.2876 | 1.4815454 | 0.1384835 |
| Age22-24 | -5976.072423 | 1170.5283 | -5.1054491 | 0.0000003 |
| Age25-29 | -2848.869176 | 1288.1800 | -2.2115460 | 0.0270139 |
| Age30-34 | 29.303920 | 1484.7600 | 0.0197365 | 0.9842539 |
| Age35-39 | -353.206940 | 1612.8983 | -0.2189890 | 0.8266618 |
| Age40-44 | -408.153980 | 1724.1126 | -0.2367328 | 0.8128675 |
| Age45-49 | -291.225984 | 1969.4702 | -0.1478702 | 0.8824473 |
| Age50-54 | 5639.952521 | 2231.8713 | 2.5270062 | 0.0115147 |
| Age55-59 | -276.017117 | 2555.6977 | -0.1080007 | 0.9139967 |
| Age60-69 | 3248.673435 | 2763.4894 | 1.1755693 | 0.2397867 |
| Age70+ | -9823.527188 | 4786.4835 | -2.0523475 | 0.0401542 |
| GenderNonbinary | 7490.414653 | 5929.5745 | 1.2632297 | 0.2065274 |
| GenderPrefer not to say | -2040.746991 | 2861.7693 | -0.7131067 | 0.4757914 |
| GenderPrefer to self-describe | -8192.943691 | 11548.6393 | -0.7094294 | 0.4780697 |
| GenderWoman | -2882.686317 | 835.0523 | -3.4521029 | 0.0005579 |
| CountryArgentina | 4794.410129 | 8313.8976 | 0.5766742 | 0.5641687 |
| CountryAustralia | 90125.117685 | 8788.2225 | 10.2552157 | 0.0000000 |
| CountryBangladesh | 930.738692 | 8177.0621 | 0.1138231 | 0.9093796 |
| CountryBelgium | 9394.209437 | 10188.4652 | 0.9220436 | 0.3565215 |
| CountryBrazil | 2108.908490 | 7653.2223 | 0.2755582 | 0.7828914 |
| CountryCameroon | 8086.397222 | 9738.9107 | 0.8303184 | 0.4063727 |
| CountryCanada | 48182.807259 | 8150.5963 | 5.9115684 | 0.0000000 |
| CountryChile | 5592.973705 | 8732.3594 | 0.6404883 | 0.5218656 |
| CountryChina | 6688.929161 | 7801.4265 | 0.8573982 | 0.3912393 |
| CountryColombia | 3171.920314 | 8094.6832 | 0.3918523 | 0.6951732 |
| CountryCzech Republic | 8366.889295 | 10267.3507 | 0.8149025 | 0.4151418 |
| CountryEcuador | -381.341373 | 10504.2498 | -0.0363035 | 0.9710408 |
| CountryEgypt | 3443.889106 | 7947.4143 | 0.4333345 | 0.6647783 |
| CountryEthiopia | 4214.205596 | 9005.6529 | 0.4679511 | 0.6398267 |
| CountryFrance | 15373.827628 | 8134.4875 | 1.8899565 | 0.0587841 |
| CountryGermany | 20152.101226 | 9244.7998 | 2.1798310 | 0.0292864 |
| CountryGhana | 6935.779915 | 8914.2843 | 0.7780524 | 0.4365511 |
| CountryHong Kong (S.A.R.) | 41653.867854 | 10105.5255 | 4.1218903 | 0.0000378 |
| CountryI do not wish to disclose my location | 1810.893775 | 10726.0297 | 0.1688317 | 0.8659314 |
| CountryIndia | 3529.329151 | 7454.5444 | 0.4734467 | 0.6359018 |
| CountryIndonesia | 3749.314956 | 7892.1598 | 0.4750683 | 0.6347456 |
| CountryIran, Islamic Republic of… | -3195.371245 | 8913.8923 | -0.3584709 | 0.7199962 |
| CountryIreland | 15762.926315 | 10108.0893 | 1.5594368 | 0.1189153 |
| CountryIsrael | 78581.138352 | 9242.9175 | 8.5017678 | 0.0000000 |
| CountryItaly | 3172.978133 | 8321.9617 | 0.3812777 | 0.7030029 |
| CountryJapan | 16636.166767 | 7787.1093 | 2.1363726 | 0.0326661 |
| CountryKenya | 2828.478527 | 8261.9300 | 0.3423508 | 0.7320920 |
| CountryMalaysia | 6803.101987 | 9401.3847 | 0.7236277 | 0.4693063 |
| CountryMexico | 3372.660760 | 7902.5291 | 0.4267825 | 0.6695443 |
| CountryMorocco | 8838.103330 | 8371.2229 | 1.0557721 | 0.2910903 |
| CountryNepal | 3723.224295 | 9328.4447 | 0.3991259 | 0.6898064 |
| CountryNetherlands | 16887.991175 | 8959.1731 | 1.8849944 | 0.0594510 |
| CountryNigeria | 2159.304887 | 7685.4796 | 0.2809590 | 0.7787459 |
| CountryOther | 8533.257298 | 7556.1862 | 1.1293074 | 0.2587872 |
| CountryPakistan | 1962.592147 | 7699.1557 | 0.2549100 | 0.7987963 |
| CountryPeru | -6041.812885 | 8912.7206 | -0.6778865 | 0.4978547 |
| CountryPhilippines | 2505.910945 | 9098.1313 | 0.2754314 | 0.7829888 |
| CountryPoland | 4794.169940 | 8806.8272 | 0.5443697 | 0.5861956 |
| CountryPortugal | -6737.836549 | 9413.5034 | -0.7157629 | 0.4741495 |
| CountryRomania | -8293.129624 | 10804.0269 | -0.7675962 | 0.4427399 |
| CountryRussia | 5328.528160 | 7970.2704 | 0.6685505 | 0.5037931 |
| CountrySaudi Arabia | 23078.299365 | 9178.6877 | 2.5143354 | 0.0119367 |
| CountrySingapore | 25752.739961 | 9812.5392 | 2.6244726 | 0.0086876 |
| CountrySouth Africa | 6407.598962 | 9185.3191 | 0.6975913 | 0.4854442 |
| CountrySouth Korea | 11465.095905 | 7970.0884 | 1.4385155 | 0.1503099 |
| CountrySpain | 9180.253781 | 8073.0033 | 1.1371547 | 0.2554928 |
| CountrySri Lanka | -10566.351955 | 9403.3159 | -1.1236836 | 0.2611663 |
| CountryTaiwan | 6174.908675 | 8190.3561 | 0.7539243 | 0.4509072 |
| CountryThailand | 6239.169456 | 8672.7639 | 0.7193981 | 0.4719075 |
| CountryTunisia | 10074.488964 | 8712.5225 | 1.1563229 | 0.2475686 |
| CountryTurkey | 2960.712011 | 7934.9800 | 0.3731215 | 0.7090636 |
| CountryUkraine | 1003.228565 | 9518.3962 | 0.1053989 | 0.9160608 |
| CountryUnited Arab Emirates | 19958.040702 | 9099.9444 | 2.1932047 | 0.0283088 |
| CountryUnited Kingdom of Great Britain and Northern Ireland | 34086.423472 | 8047.4428 | 4.2356839 | 0.0000229 |
| CountryUnited States of America | 91355.731663 | 7515.4444 | 12.1557325 | 0.0000000 |
| CountryViet Nam | -170.696465 | 8273.8481 | -0.0206308 | 0.9835404 |
| CountryZimbabwe | -1677.418088 | 11282.6020 | -0.1486730 | 0.8818138 |
| Student | 1006.985503 | 1172.6925 | 0.8586953 | 0.3905232 |
| Years.Programming< 1 years | 1666.976861 | 4284.8915 | 0.3890360 | 0.6972554 |
| Years.Programming1-3 years | 3189.779807 | 4294.8075 | 0.7427061 | 0.4576719 |
| Years.Programming10-20 years | 28979.885268 | 4477.3392 | 6.4725687 | 0.0000000 |
| Years.Programming20+ years | 42121.244728 | 4592.0178 | 9.1727094 | 0.0000000 |
| Years.Programming3-5 years | 6312.259366 | 4348.8892 | 1.4514648 | 0.1466726 |
| Years.Programming5-10 years | 12695.918584 | 4399.3700 | 2.8858492 | 0.0039095 |
| Years.ProgrammingI have never written code | 2127.701987 | 4264.5792 | 0.4989243 | 0.6178405 |
| Incorporate.Machine.LearningI do not know | -8192.178478 | 1703.6691 | -4.8085503 | 0.0000015 |
| Incorporate.Machine.LearningNo (we do not use ML methods) | -6378.255853 | 1525.2370 | -4.1818129 | 0.0000291 |
| Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) | -458.942288 | 1689.4868 | -0.2716460 | 0.7858982 |
| Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) | 26170.382216 | 1661.7037 | 15.7491267 | 0.0000000 |
| Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) | 11751.409124 | 1827.2742 | 6.4311142 | 0.0000000 |
| Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) | 2854.386769 | 1988.7993 | 1.4352312 | 0.1512432 |
| ML.Hubs…Repositories.Used TensorFlow Hub | -559.001995 | 1886.8538 | -0.2962614 | 0.7670348 |
| ML.Hubs…Repositories.Used Huggingface Models | 6444.232908 | 2400.9173 | 2.6840712 | 0.0072816 |
| ML.Hubs…Repositories.Used Jumpstart | 186.448625 | 10323.9098 | 0.0180599 | 0.9855913 |
| ML.Hubs…Repositories.Used Kaggle datasets | -148.637713 | 1410.2338 | -0.1053993 | 0.9160604 |
| ML.Hubs…Repositories.Used NVIDIA NGC models | 1020.039924 | 6199.3380 | 0.1645401 | 0.8693083 |
| ML.Hubs…Repositories.Used ONNX models | 2125.877385 | 8365.1746 | 0.2541343 | 0.7993955 |
| ML.Hubs…Repositories.Used PyTorch Hub | 2232.127475 | 2673.4229 | 0.8349324 | 0.4037698 |
| ML.Hubs…Repositories.Used Timm | 2424.112325 | 4644.6992 | 0.5219094 | 0.6017415 |
| ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) | -1685.574295 | 5189.2098 | -0.3248229 | 0.7453199 |
| Highest.Level.of.Formal.EducationBachelor’s degree | -3643.970189 | 4605.9387 | -0.7911460 | 0.4288720 |
| Highest.Level.of.Formal.EducationDoctoral degree | -1309.527980 | 4670.0898 | -0.2804075 | 0.7791690 |
| Highest.Level.of.Formal.EducationI prefer not to answer | -4695.194450 | 4741.5888 | -0.9902154 | 0.3220857 |
| Highest.Level.of.Formal.EducationMaster’s degree | -1270.036986 | 4579.8685 | -0.2773086 | 0.7815472 |
| Highest.Level.of.Formal.EducationNo formal education past high school | 504.415958 | 5055.6180 | 0.0997734 | 0.9205257 |
| Highest.Level.of.Formal.EducationProfessional doctorate | -3307.094253 | 4971.1934 | -0.6652516 | 0.5059004 |
| Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree | -2885.111336 | 4731.1667 | -0.6098097 | 0.5419976 |
| Helpful.University | 448.256012 | 874.0432 | 0.5128534 | 0.6080619 |
| Helpful.Online.Courses | -1550.934856 | 784.7419 | -1.9763630 | 0.0481330 |
| Helpful.Social.Media | 1097.837710 | 1020.1660 | 1.0761363 | 0.2818846 |
| Helpful.Video.Platform | -1173.516744 | 737.8836 | -1.5903820 | 0.1117710 |
| Helpful.Kaggle | -2155.451199 | 776.4901 | -2.7758901 | 0.0055123 |
| Helpful.None | -1527.072611 | 1961.0898 | -0.7786857 | 0.4361778 |
| Media.on.Social.Twitter | 1132.725430 | 994.7634 | 1.1386883 | 0.2548524 |
| Media.on.Social.Email.Newsletters | 922.138724 | 982.6021 | 0.9384661 | 0.3480209 |
| Media.on.Reddit | 1106.268754 | 1136.5545 | 0.9733530 | 0.3303944 |
| Media.on.Kaggle | 601.175857 | 801.2258 | 0.7503201 | 0.4530744 |
| Media.on.Course.Forums | -2896.466420 | 968.5463 | -2.9905296 | 0.0027897 |
| Media.on.Youtube | -1440.760950 | 782.8805 | -1.8403332 | 0.0657402 |
| Media.on.Podcasts | 2209.887974 | 1243.4238 | 1.7772604 | 0.0755468 |
| Media.on.Blogs | 1544.822954 | 813.3659 | 1.8992965 | 0.0575457 |
| Media.on.Journal.Publications | 972.504372 | 1013.4822 | 0.9595673 | 0.3372893 |
| Media.on.Slack.Communities | -946.337671 | 1367.0703 | -0.6922377 | 0.4887994 |
| No.Media.Sources | -3801.858218 | 1604.7920 | -2.3690661 | 0.0178464 |
| Data.Science.on.Coursera | 1471.069736 | 807.0474 | 1.8227797 | 0.0683578 |
| Data.Science.on.edX | 1904.456670 | 1188.2840 | 1.6026949 | 0.1090243 |
| Data.Science.on.Kaggle.Learn.Courses | -1185.328712 | 835.8581 | -1.4180981 | 0.1561841 |
| Data.Science.on.DataCamp | 477.655048 | 1003.3527 | 0.4760590 | 0.6340397 |
| Data.Science.on.Fast.ai | 5999.877664 | 1847.8215 | 3.2470006 | 0.0011690 |
| Data.Science.on.Udacity | 762.043662 | 1275.4122 | 0.5974881 | 0.5501911 |
| Data.Science.on.Udemy | -773.137435 | 840.2486 | -0.9201294 | 0.3575208 |
| Data.Science.on.LinkedIn.Learning | -181.414377 | 1110.3069 | -0.1633912 | 0.8702128 |
| Cloud.certification.programs | 4.381781 | 1337.4218 | 0.0032763 | 0.9973859 |
| Data.Science.University.Courses | 4541.194118 | 894.6426 | 5.0759870 | 0.0000004 |
| No.Data.Science.Courses | 228.616270 | 1279.2315 | 0.1787138 | 0.8581650 |
| Python | 978.681861 | 1197.3490 | 0.8173739 | 0.4137285 |
| R | -723.881913 | 916.1568 | -0.7901288 | 0.4294657 |
| SQL | 890.166373 | 775.2301 | 1.1482609 | 0.2508802 |
| C | -543.511535 | 1067.7065 | -0.5090458 | 0.6107280 |
| C. | -262.027092 | 1494.8646 | -0.1752848 | 0.8608583 |
| C.. | -866.829316 | 994.6779 | -0.8714674 | 0.3835137 |
| Java | -1870.630712 | 1005.8559 | -1.8597402 | 0.0629430 |
| Javascript | -2497.689869 | 1080.9607 | -2.3106205 | 0.0208681 |
| Bash | 3499.993194 | 1422.3391 | 2.4607304 | 0.0138773 |
| PHP | -2167.406105 | 1586.7687 | -1.3659244 | 0.1719843 |
| MATLAB | -2417.058235 | 1169.6502 | -2.0664796 | 0.0388013 |
| Julia | -3628.959547 | 3052.7389 | -1.1887553 | 0.2345559 |
| Go | 8078.002392 | 2938.8765 | 2.7486702 | 0.0059913 |
| No.Programming.Languages | -3198.954783 | 3427.0949 | -0.9334305 | 0.3506136 |
# Fit the same model specification on the validation set and show its summary.
# NOTE(review): this refits the model on the validation rows rather than
# scoring lmTrain on them - confirm that comparing coefficients is the intent.
lmVal <- lm(Compensation ~ ., data = validationData)
summary(lmVal)
# Coefficient table. The fitted object is reused instead of fitting the same
# model a second time.
lmVal %>%
  tidy() %>%
  kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 8897.9640 | 9948.477 | 0.8944046 | 0.3711519 |
| Age22-24 | -6641.7633 | 2071.417 | -3.2063866 | 0.0013533 |
| Age25-29 | -4096.2519 | 2260.085 | -1.8124325 | 0.0699840 |
| Age30-34 | -3537.2873 | 2651.804 | -1.3339175 | 0.1822964 |
| Age35-39 | 1898.7355 | 2907.310 | 0.6530900 | 0.5137306 |
| Age40-44 | 1161.3335 | 3053.306 | 0.3803527 | 0.7037010 |
| Age45-49 | 1692.6353 | 3488.156 | 0.4852522 | 0.6275204 |
| Age50-54 | 10773.9332 | 3888.750 | 2.7705393 | 0.0056186 |
| Age55-59 | 4081.6783 | 4476.425 | 0.9118165 | 0.3619127 |
| Age60-69 | -1471.1032 | 4907.645 | -0.2997574 | 0.7643756 |
| Age70+ | -40897.3302 | 10008.430 | -4.0862883 | 0.0000446 |
| GenderNonbinary | -5557.7545 | 11477.723 | -0.4842210 | 0.6282519 |
| GenderPrefer not to say | 680.6658 | 5013.120 | 0.1357769 | 0.8920036 |
| GenderPrefer to self-describe | 5020.2854 | 15576.711 | 0.3222943 | 0.7472443 |
| GenderWoman | -3938.2537 | 1475.383 | -2.6693101 | 0.0076272 |
| CountryArgentina | 774.9395 | 11041.271 | 0.0701857 | 0.9440489 |
| CountryAustralia | 82905.3352 | 11681.904 | 7.0969025 | 0.0000000 |
| CountryBangladesh | 9487.2623 | 10524.251 | 0.9014667 | 0.3673870 |
| CountryBelgium | 8475.7239 | 15323.063 | 0.5531351 | 0.5801976 |
| CountryBrazil | 5382.2411 | 9688.006 | 0.5555572 | 0.5785404 |
| CountryCameroon | 16453.7945 | 13638.698 | 1.2064051 | 0.2277229 |
| CountryCanada | 50711.7904 | 10701.047 | 4.7389559 | 0.0000022 |
| CountryChile | 4120.2278 | 12338.383 | 0.3339358 | 0.7384431 |
| CountryChina | 6253.3730 | 10126.417 | 0.6175307 | 0.5369151 |
| CountryColombia | 10785.0263 | 10935.932 | 0.9862009 | 0.3240860 |
| CountryCzech Republic | 8461.5124 | 17884.744 | 0.4731134 | 0.6361545 |
| CountryEcuador | -5848.6259 | 15737.762 | -0.3716301 | 0.7101853 |
| CountryEgypt | 3332.1349 | 10091.961 | 0.3301772 | 0.7412810 |
| CountryEthiopia | 9055.8706 | 13429.313 | 0.6743361 | 0.5001313 |
| CountryFrance | 22578.1563 | 10612.860 | 2.1274338 | 0.0334367 |
| CountryGermany | 14268.9710 | 12502.019 | 1.1413333 | 0.2537902 |
| CountryGhana | 2344.5582 | 12215.172 | 0.1919382 | 0.8477990 |
| CountryHong Kong (S.A.R.) | 13041.5639 | 14186.021 | 0.9193250 | 0.3579734 |
| CountryI do not wish to disclose my location | 10732.3165 | 17004.586 | 0.6311425 | 0.5279785 |
| CountryIndia | 6586.2932 | 9172.129 | 0.7180768 | 0.4727461 |
| CountryIndonesia | 10870.0604 | 10289.728 | 1.0563992 | 0.2908409 |
| CountryIran, Islamic Republic of… | 15155.0025 | 12423.579 | 1.2198580 | 0.2225807 |
| CountryIreland | 71585.1765 | 15306.774 | 4.6766992 | 0.0000030 |
| CountryIsrael | 84411.9283 | 13467.861 | 6.2676564 | 0.0000000 |
| CountryItaly | 15009.6698 | 11690.808 | 1.2838864 | 0.1992458 |
| CountryJapan | 22445.9919 | 9912.273 | 2.2644647 | 0.0235915 |
| CountryKenya | 4312.6214 | 11135.139 | 0.3872984 | 0.6985531 |
| CountryMalaysia | 341.4790 | 15762.887 | 0.0216635 | 0.9827173 |
| CountryMexico | 8518.8282 | 10240.282 | 0.8318939 | 0.4055117 |
| CountryMorocco | 10769.8410 | 11493.216 | 0.9370607 | 0.3487761 |
| CountryNepal | 12159.2154 | 14825.492 | 0.8201559 | 0.4121694 |
| CountryNetherlands | 32911.9203 | 12615.158 | 2.6089185 | 0.0091120 |
| CountryNigeria | 11215.1186 | 9787.403 | 1.1458728 | 0.2519069 |
| CountryOther | 8877.0562 | 9410.733 | 0.9432906 | 0.3455814 |
| CountryPakistan | 4144.5560 | 9878.063 | 0.4195717 | 0.6748178 |
| CountryPeru | 8525.1296 | 11976.434 | 0.7118254 | 0.4766087 |
| CountryPhilippines | 2886.8810 | 12104.594 | 0.2384947 | 0.8115080 |
| CountryPoland | 20944.6120 | 12369.502 | 1.6932461 | 0.0904758 |
| CountryPortugal | 6404.8958 | 15838.306 | 0.4043927 | 0.6859426 |
| CountryRomania | 5931.5476 | 14541.765 | 0.4078974 | 0.6833679 |
| CountryRussia | 12386.0034 | 10259.910 | 1.2072234 | 0.2274077 |
| CountrySaudi Arabia | 20884.2801 | 14184.278 | 1.4723541 | 0.1409931 |
| CountrySingapore | 33555.9692 | 12901.838 | 2.6008673 | 0.0093284 |
| CountrySouth Africa | 7916.3527 | 11419.120 | 0.6932542 | 0.4881848 |
| CountrySouth Korea | 9116.4124 | 10497.266 | 0.8684559 | 0.3851897 |
| CountrySpain | 23098.4818 | 11100.611 | 2.0808298 | 0.0375042 |
| CountrySri Lanka | -12694.9592 | 16294.252 | -0.7791066 | 0.4359567 |
| CountryTaiwan | 10427.4800 | 10693.585 | 0.9751154 | 0.3295539 |
| CountryThailand | 1394.5859 | 12229.749 | 0.1140323 | 0.9092172 |
| CountryTunisia | 7549.6580 | 13015.713 | 0.5800418 | 0.5619146 |
| CountryTurkey | 1616.1771 | 10331.055 | 0.1564387 | 0.8756940 |
| CountryUkraine | 1237.6520 | 13679.324 | 0.0904761 | 0.9279128 |
| CountryUnited Arab Emirates | 23132.3573 | 12760.481 | 1.8128123 | 0.0699254 |
| CountryUnited Kingdom of Great Britain and Northern Ireland | 51562.5763 | 11518.027 | 4.4766849 | 0.0000078 |
| CountryUnited States of America | 97751.2547 | 9303.987 | 10.5063830 | 0.0000000 |
| CountryViet Nam | -352.9408 | 11003.137 | -0.0320764 | 0.9744125 |
| CountryZimbabwe | 511.2355 | 13919.532 | 0.0367279 | 0.9707035 |
| Student | 750.5338 | 2084.419 | 0.3600686 | 0.7188122 |
| Years.Programming< 1 years | -1719.6036 | 8647.572 | -0.1988539 | 0.8423857 |
| Years.Programming1-3 years | 722.9183 | 8652.776 | 0.0835476 | 0.9334198 |
| Years.Programming10-20 years | 27996.9691 | 8893.703 | 3.1479541 | 0.0016546 |
| Years.Programming20+ years | 33746.7610 | 9112.792 | 3.7032298 | 0.0002153 |
| Years.Programming3-5 years | 4905.9652 | 8744.225 | 0.5610520 | 0.5747892 |
| Years.Programming5-10 years | 8447.6049 | 8785.765 | 0.9615105 | 0.3363457 |
| Years.ProgrammingI have never written code | -248.1520 | 8604.153 | -0.0288410 | 0.9769927 |
| Incorporate.Machine.LearningI do not know | -3960.7968 | 2959.601 | -1.3382872 | 0.1808684 |
| Incorporate.Machine.LearningNo (we do not use ML methods) | -5077.9425 | 2691.129 | -1.8869188 | 0.0592336 |
| Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) | 2358.5856 | 2851.009 | 0.8272811 | 0.4081203 |
| Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) | 29841.0260 | 2994.593 | 9.9649682 | 0.0000000 |
| Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) | 6655.1265 | 3197.036 | 2.0816552 | 0.0374287 |
| Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) | 3676.2384 | 3521.212 | 1.0440264 | 0.2965276 |
| ML.Hubs…Repositories.Used TensorFlow Hub | 5550.5443 | 3420.097 | 1.6229199 | 0.1046745 |
| ML.Hubs…Repositories.Used Huggingface Models | 11069.2004 | 4461.641 | 2.4809709 | 0.0131377 |
| ML.Hubs…Repositories.Used Jumpstart | -14603.3567 | 28787.769 | -0.5072764 | 0.6119850 |
| ML.Hubs…Repositories.Used Kaggle datasets | -949.7629 | 2512.312 | -0.3780434 | 0.7054156 |
| ML.Hubs…Repositories.Used NVIDIA NGC models | -1940.7303 | 9985.857 | -0.1943479 | 0.8459120 |
| ML.Hubs…Repositories.Used ONNX models | 6766.3049 | 12398.032 | 0.5457564 | 0.5852597 |
| ML.Hubs…Repositories.Used PyTorch Hub | -3242.6296 | 4623.816 | -0.7012887 | 0.4831581 |
| ML.Hubs…Repositories.Used Timm | 11077.5492 | 8159.729 | 1.3575879 | 0.1746605 |
| ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) | 2981.9750 | 9036.644 | 0.3299870 | 0.7414247 |
| Highest.Level.of.Formal.EducationBachelor’s degree | -704.0327 | 9083.008 | -0.0775110 | 0.9382204 |
| Highest.Level.of.Formal.EducationDoctoral degree | -1337.4739 | 9255.672 | -0.1445032 | 0.8851094 |
| Highest.Level.of.Formal.EducationI prefer not to answer | 1896.7834 | 9359.414 | 0.2026605 | 0.8394093 |
| Highest.Level.of.Formal.EducationMaster’s degree | -2507.5674 | 9042.283 | -0.2773158 | 0.7815500 |
| Highest.Level.of.Formal.EducationNo formal education past high school | 1095.0893 | 9723.973 | 0.1126175 | 0.9103387 |
| Highest.Level.of.Formal.EducationProfessional doctorate | 1815.7385 | 9750.094 | 0.1862278 | 0.8522743 |
| Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree | -7060.9140 | 9367.752 | -0.7537469 | 0.4510395 |
| Helpful.University | -981.1163 | 1526.991 | -0.6425161 | 0.5205699 |
| Helpful.Online.Courses | 177.7580 | 1393.003 | 0.1276077 | 0.8984650 |
| Helpful.Social.Media | 1782.5842 | 1849.385 | 0.9638796 | 0.3351565 |
| Helpful.Video.Platform | -1476.7613 | 1305.861 | -1.1308714 | 0.2581677 |
| Helpful.Kaggle | -1928.5795 | 1383.019 | -1.3944711 | 0.1632422 |
| Helpful.None | 786.6436 | 3426.703 | 0.2295628 | 0.8184416 |
| Media.on.Social.Twitter | 3385.9781 | 1752.493 | 1.9320927 | 0.0534089 |
| Media.on.Social.Email.Newsletters | 232.0917 | 1737.398 | 0.1335858 | 0.8937359 |
| Media.on.Reddit | 196.0316 | 2042.547 | 0.0959741 | 0.9235453 |
| Media.on.Kaggle | 1302.6609 | 1438.275 | 0.9057109 | 0.3651360 |
| Media.on.Course.Forums | -2217.9694 | 1723.620 | -1.2868086 | 0.1982252 |
| Media.on.Youtube | -2567.4087 | 1381.660 | -1.8582064 | 0.0632030 |
| Media.on.Podcasts | 2010.0364 | 2227.739 | 0.9022763 | 0.3669570 |
| Media.on.Blogs | 333.7197 | 1431.588 | 0.2331116 | 0.8156850 |
| Media.on.Journal.Publications | -399.4962 | 1823.090 | -0.2191313 | 0.8265574 |
| Media.on.Slack.Communities | -1167.1066 | 2352.511 | -0.4961110 | 0.6198396 |
| No.Media.Sources | -3509.0622 | 2905.592 | -1.2076926 | 0.2272271 |
| Data.Science.on.Coursera | -456.0458 | 1435.822 | -0.3176200 | 0.7507875 |
| Data.Science.on.edX | 768.2341 | 2131.088 | 0.3604892 | 0.7184977 |
| Data.Science.on.Kaggle.Learn.Courses | -1021.5180 | 1482.574 | -0.6890166 | 0.4908473 |
| Data.Science.on.DataCamp | -1243.9259 | 1812.741 | -0.6862126 | 0.4926134 |
| Data.Science.on.Fast.ai | 923.0810 | 3275.566 | 0.2818081 | 0.7781033 |
| Data.Science.on.Udacity | 1051.6840 | 2231.709 | 0.4712461 | 0.6374872 |
| Data.Science.on.Udemy | -882.2995 | 1457.959 | -0.6051607 | 0.5451019 |
| Data.Science.on.LinkedIn.Learning | -113.6682 | 1993.927 | -0.0570072 | 0.9545419 |
| Cloud.certification.programs | 1597.7881 | 2311.804 | 0.6911435 | 0.4895099 |
| Data.Science.University.Courses | 3510.0109 | 1588.833 | 2.2091751 | 0.0272112 |
| No.Data.Science.Courses | 123.2877 | 2204.012 | 0.0559379 | 0.9553937 |
| Python | 1857.2647 | 2117.445 | 0.8771254 | 0.3804639 |
| R | 706.2501 | 1637.289 | 0.4313535 | 0.6662314 |
| SQL | 989.1151 | 1371.301 | 0.7212968 | 0.4707633 |
| C | 1512.9581 | 1874.749 | 0.8070190 | 0.4196969 |
| C. | -1880.7577 | 2539.441 | -0.7406188 | 0.4589621 |
| C.. | 2090.7774 | 1779.804 | 1.1747233 | 0.2401657 |
| Java | -1298.6844 | 1761.031 | -0.7374567 | 0.4608820 |
| Javascript | -1436.8006 | 1886.210 | -0.7617393 | 0.4462543 |
| Bash | 2284.5738 | 2515.254 | 0.9082874 | 0.3637736 |
| PHP | -1125.5821 | 2720.452 | -0.4137482 | 0.6790777 |
| MATLAB | 1830.8836 | 2055.728 | 0.8906254 | 0.3731764 |
| Julia | 1779.4135 | 6003.624 | 0.2963899 | 0.7669456 |
| Go | 6962.7998 | 5258.782 | 1.3240329 | 0.1855574 |
| No.Programming.Languages | -3075.5039 | 6153.472 | -0.4997997 | 0.6172398 |
#fit the full linear model (Compensation against every other column) on
#testData once, and reuse the fitted object for both outputs below
lmTest <- lm(Compensation ~ ., data = testData)
#standard lm summary printout for this fit
summary(lmTest)
#coefficient table rendered as markdown; reuses lmTest instead of refitting
#the identical model, so the table always matches the summary above
lmTest %>%
  tidy() %>%
  kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 8897.9640 | 9948.477 | 0.8944046 | 0.3711519 |
| Age22-24 | -6641.7633 | 2071.417 | -3.2063866 | 0.0013533 |
| Age25-29 | -4096.2519 | 2260.085 | -1.8124325 | 0.0699840 |
| Age30-34 | -3537.2873 | 2651.804 | -1.3339175 | 0.1822964 |
| Age35-39 | 1898.7355 | 2907.310 | 0.6530900 | 0.5137306 |
| Age40-44 | 1161.3335 | 3053.306 | 0.3803527 | 0.7037010 |
| Age45-49 | 1692.6353 | 3488.156 | 0.4852522 | 0.6275204 |
| Age50-54 | 10773.9332 | 3888.750 | 2.7705393 | 0.0056186 |
| Age55-59 | 4081.6783 | 4476.425 | 0.9118165 | 0.3619127 |
| Age60-69 | -1471.1032 | 4907.645 | -0.2997574 | 0.7643756 |
| Age70+ | -40897.3302 | 10008.430 | -4.0862883 | 0.0000446 |
| GenderNonbinary | -5557.7545 | 11477.723 | -0.4842210 | 0.6282519 |
| GenderPrefer not to say | 680.6658 | 5013.120 | 0.1357769 | 0.8920036 |
| GenderPrefer to self-describe | 5020.2854 | 15576.711 | 0.3222943 | 0.7472443 |
| GenderWoman | -3938.2537 | 1475.383 | -2.6693101 | 0.0076272 |
| CountryArgentina | 774.9395 | 11041.271 | 0.0701857 | 0.9440489 |
| CountryAustralia | 82905.3352 | 11681.904 | 7.0969025 | 0.0000000 |
| CountryBangladesh | 9487.2623 | 10524.251 | 0.9014667 | 0.3673870 |
| CountryBelgium | 8475.7239 | 15323.063 | 0.5531351 | 0.5801976 |
| CountryBrazil | 5382.2411 | 9688.006 | 0.5555572 | 0.5785404 |
| CountryCameroon | 16453.7945 | 13638.698 | 1.2064051 | 0.2277229 |
| CountryCanada | 50711.7904 | 10701.047 | 4.7389559 | 0.0000022 |
| CountryChile | 4120.2278 | 12338.383 | 0.3339358 | 0.7384431 |
| CountryChina | 6253.3730 | 10126.417 | 0.6175307 | 0.5369151 |
| CountryColombia | 10785.0263 | 10935.932 | 0.9862009 | 0.3240860 |
| CountryCzech Republic | 8461.5124 | 17884.744 | 0.4731134 | 0.6361545 |
| CountryEcuador | -5848.6259 | 15737.762 | -0.3716301 | 0.7101853 |
| CountryEgypt | 3332.1349 | 10091.961 | 0.3301772 | 0.7412810 |
| CountryEthiopia | 9055.8706 | 13429.313 | 0.6743361 | 0.5001313 |
| CountryFrance | 22578.1563 | 10612.860 | 2.1274338 | 0.0334367 |
| CountryGermany | 14268.9710 | 12502.019 | 1.1413333 | 0.2537902 |
| CountryGhana | 2344.5582 | 12215.172 | 0.1919382 | 0.8477990 |
| CountryHong Kong (S.A.R.) | 13041.5639 | 14186.021 | 0.9193250 | 0.3579734 |
| CountryI do not wish to disclose my location | 10732.3165 | 17004.586 | 0.6311425 | 0.5279785 |
| CountryIndia | 6586.2932 | 9172.129 | 0.7180768 | 0.4727461 |
| CountryIndonesia | 10870.0604 | 10289.728 | 1.0563992 | 0.2908409 |
| CountryIran, Islamic Republic of… | 15155.0025 | 12423.579 | 1.2198580 | 0.2225807 |
| CountryIreland | 71585.1765 | 15306.774 | 4.6766992 | 0.0000030 |
| CountryIsrael | 84411.9283 | 13467.861 | 6.2676564 | 0.0000000 |
| CountryItaly | 15009.6698 | 11690.808 | 1.2838864 | 0.1992458 |
| CountryJapan | 22445.9919 | 9912.273 | 2.2644647 | 0.0235915 |
| CountryKenya | 4312.6214 | 11135.139 | 0.3872984 | 0.6985531 |
| CountryMalaysia | 341.4790 | 15762.887 | 0.0216635 | 0.9827173 |
| CountryMexico | 8518.8282 | 10240.282 | 0.8318939 | 0.4055117 |
| CountryMorocco | 10769.8410 | 11493.216 | 0.9370607 | 0.3487761 |
| CountryNepal | 12159.2154 | 14825.492 | 0.8201559 | 0.4121694 |
| CountryNetherlands | 32911.9203 | 12615.158 | 2.6089185 | 0.0091120 |
| CountryNigeria | 11215.1186 | 9787.403 | 1.1458728 | 0.2519069 |
| CountryOther | 8877.0562 | 9410.733 | 0.9432906 | 0.3455814 |
| CountryPakistan | 4144.5560 | 9878.063 | 0.4195717 | 0.6748178 |
| CountryPeru | 8525.1296 | 11976.434 | 0.7118254 | 0.4766087 |
| CountryPhilippines | 2886.8810 | 12104.594 | 0.2384947 | 0.8115080 |
| CountryPoland | 20944.6120 | 12369.502 | 1.6932461 | 0.0904758 |
| CountryPortugal | 6404.8958 | 15838.306 | 0.4043927 | 0.6859426 |
| CountryRomania | 5931.5476 | 14541.765 | 0.4078974 | 0.6833679 |
| CountryRussia | 12386.0034 | 10259.910 | 1.2072234 | 0.2274077 |
| CountrySaudi Arabia | 20884.2801 | 14184.278 | 1.4723541 | 0.1409931 |
| CountrySingapore | 33555.9692 | 12901.838 | 2.6008673 | 0.0093284 |
| CountrySouth Africa | 7916.3527 | 11419.120 | 0.6932542 | 0.4881848 |
| CountrySouth Korea | 9116.4124 | 10497.266 | 0.8684559 | 0.3851897 |
| CountrySpain | 23098.4818 | 11100.611 | 2.0808298 | 0.0375042 |
| CountrySri Lanka | -12694.9592 | 16294.252 | -0.7791066 | 0.4359567 |
| CountryTaiwan | 10427.4800 | 10693.585 | 0.9751154 | 0.3295539 |
| CountryThailand | 1394.5859 | 12229.749 | 0.1140323 | 0.9092172 |
| CountryTunisia | 7549.6580 | 13015.713 | 0.5800418 | 0.5619146 |
| CountryTurkey | 1616.1771 | 10331.055 | 0.1564387 | 0.8756940 |
| CountryUkraine | 1237.6520 | 13679.324 | 0.0904761 | 0.9279128 |
| CountryUnited Arab Emirates | 23132.3573 | 12760.481 | 1.8128123 | 0.0699254 |
| CountryUnited Kingdom of Great Britain and Northern Ireland | 51562.5763 | 11518.027 | 4.4766849 | 0.0000078 |
| CountryUnited States of America | 97751.2547 | 9303.987 | 10.5063830 | 0.0000000 |
| CountryViet Nam | -352.9408 | 11003.137 | -0.0320764 | 0.9744125 |
| CountryZimbabwe | 511.2355 | 13919.532 | 0.0367279 | 0.9707035 |
| Student | 750.5338 | 2084.419 | 0.3600686 | 0.7188122 |
| Years.Programming< 1 years | -1719.6036 | 8647.572 | -0.1988539 | 0.8423857 |
| Years.Programming1-3 years | 722.9183 | 8652.776 | 0.0835476 | 0.9334198 |
| Years.Programming10-20 years | 27996.9691 | 8893.703 | 3.1479541 | 0.0016546 |
| Years.Programming20+ years | 33746.7610 | 9112.792 | 3.7032298 | 0.0002153 |
| Years.Programming3-5 years | 4905.9652 | 8744.225 | 0.5610520 | 0.5747892 |
| Years.Programming5-10 years | 8447.6049 | 8785.765 | 0.9615105 | 0.3363457 |
| Years.ProgrammingI have never written code | -248.1520 | 8604.153 | -0.0288410 | 0.9769927 |
| Incorporate.Machine.LearningI do not know | -3960.7968 | 2959.601 | -1.3382872 | 0.1808684 |
| Incorporate.Machine.LearningNo (we do not use ML methods) | -5077.9425 | 2691.129 | -1.8869188 | 0.0592336 |
| Incorporate.Machine.LearningWe are exploring ML methods (and may one day put a model into production) | 2358.5856 | 2851.009 | 0.8272811 | 0.4081203 |
| Incorporate.Machine.LearningWe have well established ML methods (i.e., models in production for more than 2 years) | 29841.0260 | 2994.593 | 9.9649682 | 0.0000000 |
| Incorporate.Machine.LearningWe recently started using ML methods (i.e., models in production for less than 2 years) | 6655.1265 | 3197.036 | 2.0816552 | 0.0374287 |
| Incorporate.Machine.LearningWe use ML methods for generating insights (but do not put working models into production) | 3676.2384 | 3521.212 | 1.0440264 | 0.2965276 |
| ML.Hubs…Repositories.Used TensorFlow Hub | 5550.5443 | 3420.097 | 1.6229199 | 0.1046745 |
| ML.Hubs…Repositories.Used Huggingface Models | 11069.2004 | 4461.641 | 2.4809709 | 0.0131377 |
| ML.Hubs…Repositories.Used Jumpstart | -14603.3567 | 28787.769 | -0.5072764 | 0.6119850 |
| ML.Hubs…Repositories.Used Kaggle datasets | -949.7629 | 2512.312 | -0.3780434 | 0.7054156 |
| ML.Hubs…Repositories.Used NVIDIA NGC models | -1940.7303 | 9985.857 | -0.1943479 | 0.8459120 |
| ML.Hubs…Repositories.Used ONNX models | 6766.3049 | 12398.032 | 0.5457564 | 0.5852597 |
| ML.Hubs…Repositories.Used PyTorch Hub | -3242.6296 | 4623.816 | -0.7012887 | 0.4831581 |
| ML.Hubs…Repositories.Used Timm | 11077.5492 | 8159.729 | 1.3575879 | 0.1746605 |
| ML.Hubs…Repositories.UsedOther storage services (i.e. google drive) | 2981.9750 | 9036.644 | 0.3299870 | 0.7414247 |
| Highest.Level.of.Formal.EducationBachelor’s degree | -704.0327 | 9083.008 | -0.0775110 | 0.9382204 |
| Highest.Level.of.Formal.EducationDoctoral degree | -1337.4739 | 9255.672 | -0.1445032 | 0.8851094 |
| Highest.Level.of.Formal.EducationI prefer not to answer | 1896.7834 | 9359.414 | 0.2026605 | 0.8394093 |
| Highest.Level.of.Formal.EducationMaster’s degree | -2507.5674 | 9042.283 | -0.2773158 | 0.7815500 |
| Highest.Level.of.Formal.EducationNo formal education past high school | 1095.0893 | 9723.973 | 0.1126175 | 0.9103387 |
| Highest.Level.of.Formal.EducationProfessional doctorate | 1815.7385 | 9750.094 | 0.1862278 | 0.8522743 |
| Highest.Level.of.Formal.EducationSome college/university study without earning a bachelor’s degree | -7060.9140 | 9367.752 | -0.7537469 | 0.4510395 |
| Helpful.University | -981.1163 | 1526.991 | -0.6425161 | 0.5205699 |
| Helpful.Online.Courses | 177.7580 | 1393.003 | 0.1276077 | 0.8984650 |
| Helpful.Social.Media | 1782.5842 | 1849.385 | 0.9638796 | 0.3351565 |
| Helpful.Video.Platform | -1476.7613 | 1305.861 | -1.1308714 | 0.2581677 |
| Helpful.Kaggle | -1928.5795 | 1383.019 | -1.3944711 | 0.1632422 |
| Helpful.None | 786.6436 | 3426.703 | 0.2295628 | 0.8184416 |
| Media.on.Social.Twitter | 3385.9781 | 1752.493 | 1.9320927 | 0.0534089 |
| Media.on.Social.Email.Newsletters | 232.0917 | 1737.398 | 0.1335858 | 0.8937359 |
| Media.on.Reddit | 196.0316 | 2042.547 | 0.0959741 | 0.9235453 |
| Media.on.Kaggle | 1302.6609 | 1438.275 | 0.9057109 | 0.3651360 |
| Media.on.Course.Forums | -2217.9694 | 1723.620 | -1.2868086 | 0.1982252 |
| Media.on.Youtube | -2567.4087 | 1381.660 | -1.8582064 | 0.0632030 |
| Media.on.Podcasts | 2010.0364 | 2227.739 | 0.9022763 | 0.3669570 |
| Media.on.Blogs | 333.7197 | 1431.588 | 0.2331116 | 0.8156850 |
| Media.on.Journal.Publications | -399.4962 | 1823.090 | -0.2191313 | 0.8265574 |
| Media.on.Slack.Communities | -1167.1066 | 2352.511 | -0.4961110 | 0.6198396 |
| No.Media.Sources | -3509.0622 | 2905.592 | -1.2076926 | 0.2272271 |
| Data.Science.on.Coursera | -456.0458 | 1435.822 | -0.3176200 | 0.7507875 |
| Data.Science.on.edX | 768.2341 | 2131.088 | 0.3604892 | 0.7184977 |
| Data.Science.on.Kaggle.Learn.Courses | -1021.5180 | 1482.574 | -0.6890166 | 0.4908473 |
| Data.Science.on.DataCamp | -1243.9259 | 1812.741 | -0.6862126 | 0.4926134 |
| Data.Science.on.Fast.ai | 923.0810 | 3275.566 | 0.2818081 | 0.7781033 |
| Data.Science.on.Udacity | 1051.6840 | 2231.709 | 0.4712461 | 0.6374872 |
| Data.Science.on.Udemy | -882.2995 | 1457.959 | -0.6051607 | 0.5451019 |
| Data.Science.on.LinkedIn.Learning | -113.6682 | 1993.927 | -0.0570072 | 0.9545419 |
| Cloud.certification.programs | 1597.7881 | 2311.804 | 0.6911435 | 0.4895099 |
| Data.Science.University.Courses | 3510.0109 | 1588.833 | 2.2091751 | 0.0272112 |
| No.Data.Science.Courses | 123.2877 | 2204.012 | 0.0559379 | 0.9553937 |
| Python | 1857.2647 | 2117.445 | 0.8771254 | 0.3804639 |
| R | 706.2501 | 1637.289 | 0.4313535 | 0.6662314 |
| SQL | 989.1151 | 1371.301 | 0.7212968 | 0.4707633 |
| C | 1512.9581 | 1874.749 | 0.8070190 | 0.4196969 |
| C. | -1880.7577 | 2539.441 | -0.7406188 | 0.4589621 |
| C.. | 2090.7774 | 1779.804 | 1.1747233 | 0.2401657 |
| Java | -1298.6844 | 1761.031 | -0.7374567 | 0.4608820 |
| Javascript | -1436.8006 | 1886.210 | -0.7617393 | 0.4462543 |
| Bash | 2284.5738 | 2515.254 | 0.9082874 | 0.3637736 |
| PHP | -1125.5821 | 2720.452 | -0.4137482 | 0.6790777 |
| MATLAB | 1830.8836 | 2055.728 | 0.8906254 | 0.3731764 |
| Julia | 1779.4135 | 6003.624 | 0.2963899 | 0.7669456 |
| Go | 6962.7998 | 5258.782 | 1.3240329 | 0.1855574 |
| No.Programming.Languages | -3075.5039 | 6153.472 | -0.4997997 | 0.6172398 |
The R^2 values of the two fits are very close to one another, which suggests the model performs consistently across them (a good sign).
#load the continuous-compensation data and clean it in a single pass:
#  1. coerce columns 2-4 and 6-14 (selected by position) to factors
#  2. keep rows with Compensation <= 500000 and drop the
#     "Prefer to self-describe" gender rows (treated as outliers)
#  3. remove the predictors previously flagged as inflated
#optional country restriction, currently disabled:
#selected_contries <- c("United States of America", "Australia", "France", "Canada", "Germany", "Ireland", "Italy", "India", "Japan", "Portugal", "South Korea", "Spain", "Hong Kong (S.A.R.)", "United Arab Emirates","United Kingdom of Great Britain and Northern Ireland" )
#kaggleCon <- kaggleCon[kaggleCon$Country %in% selected_contries, ]
kaggleCon <- read.csv("kaggleContinuous.csv") %>%
  mutate(across(c(2:4, 6:14), as.factor)) %>%
  filter(
    Compensation <= 500000,
    Gender != "Prefer to self-describe"
  ) %>%
  select(
    -c(
      Published.Academic.Research.Papers,
      How.many.individuals.are.responsible,
      Company.Size,
      Years.Used.Machine.Learning,
      Industry.of.Work
    )
  )
#regress Compensation on every remaining column
#NOTE(review): the fitted object shares its name with the lm() function; the
#name is kept here so any later code that refers to `lm` keeps working
lm <- lm(Compensation ~ ., data = kaggleCon)
summary(lm)
#variance inflation factors for the fitted model
vif(lm)
#predictions for the same rows the model was fitted on
predict(lm, newdata = kaggleCon)